% Ruder (2016): arXiv preprint 1609.04747, record exported from DBLP.
% journal/volume keep the DBLP "CoRR" convention for classic styles;
% archiveprefix/eprint expose the arXiv identifier to eprint-aware styles.
@article{DBLP:journals/corr/Ruder16,
  author        = {Ruder, Sebastian},
  title         = {An overview of gradient descent optimization algorithms},
  journal       = {CoRR},
  volume        = {abs/1609.04747},
  year          = {2016},
  archiveprefix = {arXiv},
  eprint        = {1609.04747},
  url           = {http://arxiv.org/abs/1609.04747},
  timestamp     = {Wed, 07 Jun 2017 14:40:06 +0200},
  biburl        = {http://dblp.uni-trier.de/rec/bib/journals/corr/Ruder16},
  bibsource     = {dblp computer science bibliography, http://dblp.org}
}

% Duchi, Hazan and Singer (2011): journal article, record exported from the
% ACM Digital Library (see acmid/url).
% NOTE(review): issue_date (2/1/2011) and month (jul) disagree; issue_date is
% not a standard field and is ignored by styles -- confirm the month against
% the publisher page.
@article{Duchi:2011:ASM:1953048.2021068,
  author     = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title      = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
  journal    = {Journal of Machine Learning Research},
  issue_date = {2/1/2011},
  volume     = {12},
  month      = jul,
  year       = {2011},
  issn       = {1532-4435},
  pages      = {2121--2159},
  numpages   = {39},
  url        = {http://dl.acm.org/citation.cfm?id=1953048.2021068},
  acmid      = {2021068},
  publisher  = {JMLR.org},
}

% Zeiler (2012): arXiv preprint 1212.5701, record exported from DBLP.
% journal/volume keep the DBLP "CoRR" convention for classic styles;
% archiveprefix/eprint expose the arXiv identifier to eprint-aware styles.
@article{DBLP:journals/corr/abs-1212-5701,
  author        = {Zeiler, Matthew D.},
  title         = {{ADADELTA:} An Adaptive Learning Rate Method},
  journal       = {CoRR},
  volume        = {abs/1212.5701},
  year          = {2012},
  archiveprefix = {arXiv},
  eprint        = {1212.5701},
  url           = {http://arxiv.org/abs/1212.5701},
  timestamp     = {Wed, 07 Jun 2017 14:43:02 +0200},
  biburl        = {http://dblp.uni-trier.de/rec/bib/journals/corr/abs-1212-5701},
  bibsource     = {dblp computer science bibliography, http://dblp.org}
}

% Kingma and Ba (2014): arXiv preprint 1412.6980, record exported from DBLP.
% journal/volume keep the DBLP "CoRR" convention for classic styles;
% archiveprefix/eprint expose the arXiv identifier to eprint-aware styles.
@article{DBLP:journals/corr/KingmaB14,
  author        = {Kingma, Diederik P. and
                   Ba, Jimmy},
  title         = {Adam: {A} Method for Stochastic Optimization},
  journal       = {CoRR},
  volume        = {abs/1412.6980},
  year          = {2014},
  archiveprefix = {arXiv},
  eprint        = {1412.6980},
  url           = {http://arxiv.org/abs/1412.6980},
  timestamp     = {Wed, 07 Jun 2017 14:40:52 +0200},
  biburl        = {http://dblp.uni-trier.de/rec/bib/journals/corr/KingmaB14},
  bibsource     = {dblp computer science bibliography, http://dblp.org}
}

% Tieleman and Hinton (2012): Lecture 6.5 of the Coursera course named in
% howpublished. Only the mixed-case name RmsProp is brace-protected, so
% styles remain free to apply their own casing to the rest of the title.
@misc{Tieleman2012,
  author       = {Tieleman, Tijmen and Hinton, Geoffrey},
  title        = {Lecture 6.5---{RmsProp}: Divide the gradient by a running average of its recent magnitude},
  howpublished = {COURSERA: Neural Networks for Machine Learning},
  year         = {2012}
}


% Qian (1999): Neural Networks 12(1), record exported from BibSonomy/DBLP.
% The BibSonomy import date (2005-11-16) is kept only in added-at/timestamp:
% a "date" field would be read by biblatex as the publication date and
% override year = 1999. The DOI from the "ee" resolver URL is also given in
% bare form in "doi".
@article{journals/nn/Qian99,
  added-at    = {2005-11-16T00:00:00.000+0100},
  author      = {Qian, Ning},
  biburl      = {https://www.bibsonomy.org/bibtex/25467c3fc1e5a8200fc01310208258c53/dblp},
  description = {dblp},
  doi         = {10.1016/S0893-6080(98)00116-6},
  ee          = {http://dx.doi.org/10.1016/S0893-6080(98)00116-6},
  interhash   = {2b93b2cc86fc9b2dc20e2e367344acb4},
  intrahash   = {5467c3fc1e5a8200fc01310208258c53},
  journal     = {Neural Networks},
  keywords    = {dblp},
  number      = {1},
  pages       = {145--151},
  timestamp   = {2005-11-16T00:00:00.000+0100},
  title       = {On the momentum term in gradient descent learning algorithms},
  url         = {http://dblp.uni-trier.de/db/journals/nn/nn12.html#Qian99},
  volume      = {12},
  year        = {1999}
}

